Get target meta data incl. number of downstream genes
target_df <- read.csv(file.path(OutFolder, "7b_target_summary", paste0(date, "_target_meta_data.csv")))
target_df <- dplyr::filter(target_df, gene != "NonTarget")
Number of Effects in total
target_df %>%
dplyr::filter(n_cells >= min_cells_per_gene) %>%
ggplot(aes(x=n_downstream, fill = group)) + geom_histogram() + facet_wrap(~group) +
theme_classic() + scale_fill_manual(values = cols4Pools) +
xlab("Number of genes affected by knockdown") + scale_x_log10() +
guides(fill = "none")
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2011 rows containing non-finite values (stat_bin).

target_df_mincells <- filter(target_df, !is.na(n_downstream_excl_target))
nrow(target_df_mincells)
## [1] 6673
gg <- target_df_mincells %>%
ggplot(aes(x=pre_day10_lfc_mean, y = n_downstream_excl_target, col = group)) +
geom_pointdensity() +
scale_y_log10() + theme_bw() + guides(col = "none") +
ylab("# DEGs") +
xlab("Growth phenotype (day 10 LFC)") +
scale_color_manual(values = cols4Pools) +
ggpubr::stat_cor(col = "black", cor.coef.name = "rho", method = "spearman") +
geom_smooth(method = "lm", col = "darkgrey", lty = "dashed", se = F)
ggExtra::ggMarginal(gg, groupColour = TRUE, type = "histogram", groupFill = TRUE, position = 'dodge')
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Warning: Removed 3300 rows containing non-finite values (stat_pointdensity).
## Warning: Removed 3300 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3300 rows containing non-finite values (stat_smooth).
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Warning: Removed 3300 rows containing non-finite values (stat_pointdensity).
## Warning: Removed 3300 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3300 rows containing non-finite values (stat_smooth).

Top Hits
df4plot <- target_df %>% slice_max(order_by = n_downstream_excl_target, n = 25)
ggplot(df4plot, aes(y = reorder(gene, n_downstream_excl_target), x = n_downstream_excl_target, fill = group)) +
xlab('Number of Downstream DE Genes in Trans') + ylab('') +
scale_fill_manual(values = cols4Pools) +
geom_bar(stat = 'identity') + theme_bw()

## do strong and moderate
df4plot <- split.data.frame(target_df, f = target_df$group)
df4plot <- mapply(df4plot, FUN = function(df){
rtn <- df %>% slice_max(order_by = n_downstream_excl_target, n = 25) %>%
mutate(group = ifelse(group == "Moderate", "Non-Fitness Genes", "Fitness Genes"))
p <- ggplot(rtn, aes(y = reorder(gene, n_downstream_excl_target), x = n_downstream_excl_target, fill = group)) +
facet_wrap(~group) +
xlab('Number of Downstream DE Genes in Trans') + ylab('') +
scale_fill_manual(values = cols4Pools_Renamed) +
geom_bar(stat = 'identity') + theme_bw() + theme(legend.position = 'none')
return(p)
}, SIMPLIFY = F)
ggpubr::ggarrange(plotlist = df4plot, ncol = 2)

#target_df %>% slice_max(order_by = n_downstream_excl_target, n = 25)
Add variances
# TODO this should be in step 7b and part of the target_df
# sc_variance is contained, but not variance at bulk level
var_donor <- read.table(file.path(OutFolder, "Preprocess_External_Datasets", "coexpression", paste0("variance_in_expression_across_donors_magpie-", date, "-version.tsv")))
var_sc_tab <- read.table(file.path(OutFolder, "Preprocess_External_Datasets", "coexpression", paste0("sc_regressed_variance_per_gene_magpie-", date, "-version.tsv")), sep = "\t", skip = 1)
var_sc <- var_sc_tab$V2
names(var_sc) <- var_sc_tab$V1
stopifnot(all(names(var_sc) == rownames(var_donor)))
df_vars <- cbind(var_donor, var_sc) %>% rownames_to_column("target") %>% mutate(target_short = gsub(".*?:", "",
gsub(":Gene-Expression", "",target)))
stopifnot(!any(duplicated(df_vars$target_short)))
target_df <- left_join(target_df,df_vars, by =c("gene" = "target_short"))
all(target_df$sc_variance == target_df$var_sc, na.rm = TRUE)
## [1] TRUE
Add conservation scores
# TODO this should be in step 7b and part of the target_df
cons_scores <- read.csv(file.path(ResourcesFolder,
"conservation_scores",
"conservation_scores_phastCons100way.UCSC.hg38.csv"))
target_df <- left_join(target_df, cons_scores, by = c("gene" = "GeneSymbol"))
table(is.na(target_df$Conservation))
##
## FALSE TRUE
## 6671 555
Correct mean expression
# TODO: not needed any more with newer runs of the pipeline
if(params$date == "2022-05-31") {
tmp <- fread("/lustre/scratch123/hgi/projects/crispri_scrnaseq/outs/Magpie/6b_calc_lfcs_transcriptome_wide_by_gene/2022-05-31_combined_with_adj_pval_REORDERED.tsv.gz")
df_mean_expr <- tmp %>% group_by(downstream_gene) %>% summarise(control_norm_expr_corrected = unique(control_norm_expr))
target_df <- left_join(target_df, df_mean_expr, by = c("target" = "downstream_gene"))
} else {
target_df$control_norm_expr_corrected <- target_df$control_norm_expr
}
Plot correlation matrix
X <- target_df %>% dplyr::filter(!is.na(n_downstream_excl_target)) %>%
dplyr::select(n_downstream_excl_target,
n_cells,
mean_effect_depmap,
var_effect_depmap,
growth_lfc_d3 = pre_day3_lfc_mean,
growth_lfc_d4 = pre_day4_lfc_mean,
growth_lfc_d5 = pre_day5_lfc_mean,
growth_lfc_d9 = pre_day9_lfc_mean,
growth_lfc_d10 = pre_day10_lfc_mean,
n_coess_modules,
mean_expression_control = control_norm_expr_corrected,
n_PPIs = num_omnipath_interactions,
n_protein_complexes = num_omnipath_protein_complexes,
n_hallmark_gene_sets = num_msigdb_gene_sets,
n_trans_effects_eQTLs = num_eQTLs_cis_gene,
heritability = mean_var_expl_by_donor_kilpinen,
target_lfc,
var_across_donors,
var_sc) %>%
as.matrix()
dim(X)
## [1] 6673 19
n_targets_av <- apply(X,2, function(c) sum(!is.na(c))) # targets per annotation
n_targets_av
## n_downstream_excl_target n_cells mean_effect_depmap
## 6673 6673 6533
## var_effect_depmap growth_lfc_d3 growth_lfc_d4
## 6533 6673 6673
## growth_lfc_d5 growth_lfc_d9 growth_lfc_d10
## 6673 6673 6673
## n_coess_modules mean_expression_control n_PPIs
## 6026 4874 6673
## n_protein_complexes n_hallmark_gene_sets n_trans_effects_eQTLs
## 6673 6673 6673
## heritability target_lfc var_across_donors
## 4418 4874 6033
## var_sc
## 6033
summary(n_targets_av)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4418 6033 6673 6249 6673 6673
sum(apply(is.na(X),1, function(r) sum(r))==0) # targets with all annotations
## [1] 2968
cor_mat <- cor(X, method = "spearman", use = "pairwise.complete")
my_heatmap(t(cor_mat), show_colnames = F, min_c= -1, max_c = 1, midpoint = 0)

cor(X[,c("n_downstream_excl_target", "growth_lfc_d10")], method = "spearman")
## n_downstream_excl_target growth_lfc_d10
## n_downstream_excl_target 1.0000000 -0.2867884
## growth_lfc_d10 -0.2867884 1.0000000
cor_mat["n_downstream_excl_target", "growth_lfc_d10"] # from 2B
## [1] -0.2867884
order_vars <- order(cor_mat["n_downstream_excl_target",colnames(cor_mat) != "n_downstream_excl_target"])
names_var <- names(cor_mat["n_downstream_excl_target",colnames(cor_mat) != "n_downstream_excl_target"])[order_vars]
pvals_cor_test <- sapply(names_var,
function(nm) cor.test(X[,nm], X[,"n_downstream_excl_target"], method = "spearman")$p.value)
## Warning in cor.test.default(X[, nm], X[, "n_downstream_excl_target"], method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(X[, nm], X[, "n_downstream_excl_target"], method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(X[, nm], X[, "n_downstream_excl_target"], method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(X[, nm], X[, "n_downstream_excl_target"], method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(X[, nm], X[, "n_downstream_excl_target"], method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(X[, nm], X[, "n_downstream_excl_target"], method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(X[, nm], X[, "n_downstream_excl_target"], method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(X[, nm], X[, "n_downstream_excl_target"], method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(X[, nm], X[, "n_downstream_excl_target"], method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(X[, nm], X[, "n_downstream_excl_target"], method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(X[, nm], X[, "n_downstream_excl_target"], method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(X[, nm], X[, "n_downstream_excl_target"], method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(X[, nm], X[, "n_downstream_excl_target"], method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(X[, nm], X[, "n_downstream_excl_target"], method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(X[, nm], X[, "n_downstream_excl_target"], method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(X[, nm], X[, "n_downstream_excl_target"], method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(X[, nm], X[, "n_downstream_excl_target"], method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(X[, nm], X[, "n_downstream_excl_target"], method =
## "spearman"): Cannot compute exact p-value with ties
padj <- p.adjust(pvals_cor_test, method = "BH")
names(padj)[padj < 0.05]
## [1] "growth_lfc_d10" "growth_lfc_d9"
## [3] "mean_effect_depmap" "growth_lfc_d5"
## [5] "growth_lfc_d4" "var_across_donors"
## [7] "target_lfc" "var_sc"
## [9] "n_hallmark_gene_sets" "growth_lfc_d3"
## [11] "n_coess_modules" "mean_expression_control"
## [13] "n_protein_complexes" "var_effect_depmap"
## [15] "n_cells"
stopifnot(names(padj) == names_var)
nicer_names <- data.frame(
old_name = c("n_downstream_excl_target", "var_effect_depmap", "n_protein_complexes",
"conservation_score", "mean_expression_control",
"n_coess_modules", "n_trans_effects_eQTLs",
"n_PPIs", "heritability", "growth_lfc_d3",
"n_hallmark_gene_sets",
"var_across_donors",
"n_cells",
"growth_lfc_d4",
"var_sc",
"target_lfc",
"growth_lfc_d5",
"growth_lfc_d9",
"growth_lfc_d10",
"mean_effect_depmap"),
new_name = c("n_downstream_excl_target", "Variance of Effect Across DepMap Lines", "Number of Protein Complexes",
"Conservation Score", "Mean Expression (Control Cells)",
"Number of Coessentiality Modules", "Number of Trans Effects (eQTLs)", "Number of Protein Interactions", "Heritability",
"Growth LFC (Day 3)", "Number of Hallmark Gene Sets",
"Variance of Expression (Bulk across Donors)", "Number of Cells", "Growth LFC (Day 4)", "Variance of Expression (Single-Cell)",
"LFC of Target Gene","Growth LFC (Day 5)", "Growth LFC (Day 9)",
"Growth LFC (Day 10)", "Mean Effect in DepMap")
)
colnames(cor_mat) <- nicer_names$new_name[match(row.names(cor_mat), nicer_names$old_name)]
my_heatmap(cor_mat["n_downstream_excl_target", nicer_names$new_name[match(names_var, nicer_names$old_name)], drop = FALSE],
cluster_rows = FALSE, cluster_cols = FALSE, show_rownames = FALSE,
display_numbers = t(as.matrix(ifelse(padj<0.001,"***", ifelse(padj<0.01,"**", ifelse(padj<0.05,"*",""))))))

## exclude some stuff
#df4plot <- cor_mat["n_downstream_excl_target", nicer_names$new_name[match(names_var, nicer_names$old_name)]]
#my_heatmap(, drop = FALSE],
# cluster_rows = FALSE, cluster_cols = FALSE, show_rownames = FALSE,
# display_numbers = t(as.matrix(ifelse(padj<0.001,"***", ifelse(padj<0.01,"**", #ifelse(padj<0.05,"*",""))))))
for(nm in names_var){
print(qplot(X[,nm], X[, "n_downstream_excl_target"]) +
geom_pointdensity() +
ggpubr::stat_cor(method = "spearman", cor.coef.name = "rho") +
xlab(nm) + ylab("# DEGs") + theme_bw() +
scale_y_log10())
}


















knitr::opts_chunk$set(eval = FALSE)
Growth effect
target_df %>%
select(n_downstream_excl_target, group, starts_with("growth_lfc")) %>%
gather(starts_with("growth_lfc"), key = "day", value = LFC_growth) %>%
mutate(day = factor(day, levels = paste0("growth_lfc_d",1:11))) %>%
ggplot(aes(x= LFC_growth, y=n_downstream_excl_target, col = group)) + geom_point() +
facet_grid(day~group) +
theme_classic() + scale_color_manual(values = cols4Pools) +
xlab("LFC in growth screen") + ylab("Number of genes affected") +
guides(col = "none")
ggplot(target_df, aes(x= group, y=n_downstream_excl_target, fill = group)) + geom_boxplot() +
theme_classic() + scale_fill_manual(values = cols4Pools) +
xlab("LFC of target gene") + ylab("Number of genes affected") +
guides(fill = "none") + scale_y_log10() + ggpubr::stat_compare_means()
Number of cells
ggplot(target_df, aes(x= n_cells, y=n_downstream_excl_target, col = group)) + geom_point() +
facet_wrap(~group) +
theme_classic() + scale_color_manual(values = cols4Pools) +
xlab("Number of cells / target") + ylab("Number of genes affected")
guides(fill = "none")
Target expression
ggplot(target_df, aes(x= mean_expression_control, y=n_downstream_excl_target, col = group)) + geom_point() +
facet_wrap(~group) +
theme_classic() + scale_color_manual(values = cols4Pools) +
xlab("Expression of target") + ylab("Number of genes affected")
guides(fill = "none")
Target LFC
ggplot(target_df, aes(x= target_lfc, y=n_downstream_excl_target, col = group)) + geom_point() +
facet_wrap(~group) +
theme_classic() + scale_color_manual(values = cols4Pools) +
xlab("LFC of target gene") + ylab("Number of genes affected")
guides(fill = "none")
Variance
ggplot(target_df, aes(x= var_across_donors, y=n_downstream_excl_target, col = group)) + geom_point() +
facet_wrap(~group) +
theme_classic() + scale_color_manual(values = cols4Pools) +
xlab("Target variance (pseudo-bulk, Cuomo et al)") + ylab("Number of genes affected")
guides(fill = "none")
ggplot(target_df, aes(x= var_sc, y=n_downstream_excl_target, col = group)) + geom_point() +
facet_wrap(~group) +
theme_classic() + scale_color_manual(values = cols4Pools) +
xlab("Target variance (single-cell, Cuomo et al)") + ylab("Number of genes affected")
guides(fill = "none")
Heritability
ggplot(target_df, aes(x= heritability, y=n_downstream_excl_target, col = group)) + geom_point() +
facet_wrap(~group) +
theme_classic() + scale_color_manual(values = cols4Pools) +
xlab("Heritability (Kilpinen et al)") + ylab("Number of genes affected") +
guides(fill = "none")
Conservation
ggplot(target_df, aes(x= conservation_phastCons100way, y=n_downstream_excl_target, col = group)) +
geom_point() +
facet_wrap(~group) +
theme_classic() + scale_color_manual(values = cols4Pools) +
xlab("conservation score (phastCons100way)") + ylab("Number of genes affected") +
guides(fill = "none")
Interactions
ggplot(target_df, aes(x= num_omnipath_interactions, y=n_downstream_excl_target, col = group)) + geom_point() +
facet_wrap(~group) +
theme_classic() + scale_color_manual(values = cols4Pools) +
xlab("# Omnipath interactions") + ylab("Number of genes affected") +
guides(fill = "none")
ggplot(target_df, aes(x= num_msigdb_gene_sets, y=n_downstream_excl_target, col = group)) + geom_point() +
facet_wrap(~group) +
theme_classic() + scale_color_manual(values = cols4Pools) +
xlab("# MSigDB gene set annotated in") + ylab("Number of genes affected") +
guides(fill = "none")
ggplot(target_df, aes(x= num_omnipath_protein_complexes>0, y=n_downstream_excl_target, col = group)) + #geom_boxplot() +
facet_wrap(~group) + stat_summary() +
theme_classic() + scale_color_manual(values = cols4Pools) +
xlab("Is in protein complex?") + ylab("Number of genes affected") +
guides(fill = "none")
ggplot(target_df, aes(x= num_eQTLs_genes>0, y=n_downstream_excl_target, col = group)) + #geom_boxplot() +
facet_wrap(~group) + stat_summary() +
theme_classic() + scale_color_manual(values = cols4Pools) +
xlab("Is in eQTL gene ?") + ylab("Number of genes affected") +
guides(fill = "none")
DEG vs. Growth Phenotype
gg <- target_df %>%
ggplot(aes(x=growth_lfc_d10, y = n_downstream, col = group)) +
geom_pointdensity() +
scale_y_log10() + theme_bw() + guides(col = "none") +
ylab("Number of DEGs (padj < 0.1, |LFC| > 0.1)") +
xlab("Growth phenotype (day 10 LFC)") +
scale_color_manual(values = cols4Pools)
ggExtra::ggMarginal(gg, groupColour = TRUE, type = "histogram", groupFill = TRUE, position = 'dodge')